import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
from sklearn. model_selection import train_test_split
from sklearn. model_selection import RepeatedStratifiedKFold
from sklearn. model_selection import cross_val_score
from sklearn. discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
# Seed both RNGs for reproducibility: `random.seed` only covers the stdlib
# generator, while numpy/pandas/scikit-learn draw from numpy's global RNG.
random.seed(42)
np.random.seed(42)
# Use seaborn's "ticks" style for all subsequent plots.
sns.set_theme(style="ticks")
Будем использовать данные по качеству вина (подробное описание смотреть в практике по классификации)
# Load the red-wine dataset and binarize the target: wines with an original
# quality rating of 6 or more are labelled True ("good").
df = pd.read_csv('winequality-red2.csv', delimiter=',')
df['quality'] = df['quality'] >= 6
print(df.head())
print(df.shape)
fixed acidity volatile acidity citric acid residual sugar chlorides \ 0 7.4 0.70 0.00 1.9 0.076 1 7.8 0.88 0.00 2.6 0.098 2 7.8 0.76 0.04 2.3 0.092 3 11.2 0.28 0.56 1.9 0.075 4 7.4 0.70 0.00 1.9 0.076 free sulfur dioxide total sulfur dioxide density pH sulphates \ 0 11.0 34.0 0.9978 3.51 0.56 1 25.0 67.0 0.9968 3.20 0.68 2 15.0 54.0 0.9970 3.26 0.65 3 17.0 60.0 0.9980 3.16 0.58 4 11.0 34.0 0.9978 3.51 0.56 alcohol quality 0 9.4 False 1 9.8 False 2 9.8 False 3 9.8 True 4 9.4 False (1599, 12)
# Keep the predictive columns used in this experiment (drops 'citric acid'
# and 'free sulfur dioxide') plus the binary target.
df1 = df[['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality']]
# Hold out 20% as a test set. NOTE: `random.seed` at the top of the file does
# not influence scikit-learn, so pass an explicit random_state to make the
# split reproducible.
df1_train, df1_test = train_test_split(df1, test_size=0.2, random_state=42)
Проведем предобработку данных:
# Inspect raw feature distributions and pairwise relations on the train split.
plt.show(sns.pairplot(df1_train.iloc[:, :-1]))
plt.show(sns.pairplot(df1_train.iloc[:, :-1], kind="kde"))

# Log-transform the skewed features; 'volatile acidity' is carried over
# unchanged under a snake_case name. Inserting every new column at
# position 1 reproduces the exact original column order.
feature_specs = [
    ('fixed_acidity_log', 'fixed acidity', True),
    ('residual_sugar_log', 'residual sugar', True),
    ('volatile_acidity', 'volatile acidity', False),
    ('chlorides_log', 'chlorides', True),
    ('total_sulfur_dioxide_log', 'total sulfur dioxide', True),
    ('pH_log', 'pH', True),
    ('sulphates_log', 'sulphates', True),
    ('alcohol_log', 'alcohol', True),
]
for new_name, src_name, take_log in feature_specs:
    values = np.log(df1_train[src_name]) if take_log else df1_train[src_name]
    df1_train.insert(1, new_name, values)
# Drop the original (untransformed) source columns; 'density' stays as-is.
df1_train = df1_train.drop([src for _, src, _ in feature_specs], axis = 1)

plt.show(sns.pairplot(df1_train, hue = 'quality', diag_kind='hist'))

# Remove the outliers spotted on the pair plots (drop rows matching any of
# the three conditions, exactly as the three successive drops did).
outlier_mask = (
    (df1_train.chlorides_log < -4)
    | (df1_train.total_sulfur_dioxide_log > 5.1)
    | (df1_train.alcohol_log > 2.7)
)
df1_train = df1_train.drop(df1_train[outlier_mask].index)

plt.show(sns.pairplot(df1_train, hue = 'quality', diag_kind='hist'))
# Apply the same feature transformations to the held-out test split
# ('volatile acidity' is kept untransformed; everything else is logged).
# Inserting at position 1 reproduces the original column order.
test_specs = [
    ('fixed_acidity_log', 'fixed acidity', True),
    ('residual_sugar_log', 'residual sugar', True),
    ('volatile_acidity', 'volatile acidity', False),
    ('chlorides_log', 'chlorides', True),
    ('total_sulfur_dioxide_log', 'total sulfur dioxide', True),
    ('pH_log', 'pH', True),
    ('sulphates_log', 'sulphates', True),
    ('alcohol_log', 'alcohol', True),
]
for new_name, src_name, take_log in test_specs:
    values = np.log(df1_test[src_name]) if take_log else df1_test[src_name]
    df1_test.insert(1, new_name, values)
# Drop the original source columns, mirroring the training preprocessing.
df1_test = df1_test.drop([src for _, src, _ in test_specs], axis = 1)
# Split features from the target. 'quality' is the last column in both
# frames, so dropping it is equivalent to taking iloc[:, :-1].
X_train = df1_train.drop(columns=['quality']).to_numpy()
X_test = df1_test.drop(columns=['quality']).to_numpy()
# Convert the boolean labels to 0/1 integers for the classifiers.
y_train = df1_train['quality'].to_numpy(dtype=int)
y_test = df1_test['quality'].to_numpy(dtype=int)
Обучим модель при фиксированных гиперпараметрах и построим графики зависимости ошибки на обучающей (train) и валидационной (validation) выборках от номера эпохи.
# Train an MLP one epoch at a time (max_iter=1 together with warm_start=True
# continues training from the previous state on every .fit call) and record
# train/validation accuracy after each epoch.
mlp = MLPClassifier(hidden_layer_sizes=(1000,), activation='relu',
                    solver='adam',
                    alpha=1e-4, verbose=False, learning_rate_init=.0001,
                    max_iter=1, warm_start=True, early_stopping=False)

# Carve a stratified 25% validation split out of the training data.
X_train1, X_train2, y_train1, y_train2 = train_test_split(
    X_train, y_train, train_size=0.75, stratify=y_train)

# Accumulate scores in plain lists: np.append copies the whole array on
# every call, which is quadratic over 1000 epochs.
train_scores = []
val_scores = []
for epoch in range(1000):
    mlp.fit(X_train1, y_train1)  # one more epoch thanks to warm_start
    train_scores.append(mlp.score(X_train1, y_train1))
    val_scores.append(mlp.score(X_train2, y_train2))
mlp_score1 = np.array(train_scores)
mlp_score2 = np.array(val_scores)

# Plot misclassification rate (1 - accuracy) per epoch.
plt.figure()
plt.plot(1 - mlp_score1)
plt.plot(1 - mlp_score2)
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','val'], loc = 'upper right')
plt.show()
/opt/anaconda3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:702: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1) reached and the optimization hasn't converged yet. warnings.warn(
В определенный момент, примерно на 500-й эпохе, модель начинает показывать худшие результаты на валидационной выборке, так как переобучается — слишком сильно подстраивается под обучающую (train) выборку.
Проверим качество классификации для 3 вариантов: без нормализации, mean normalization (стандартизация), min-max normalization.
# Hyper-parameter grid for the MLP step of the pipeline
# (2 * 3 * 4 * 3 * 2 * 2 = 288 candidate combinations).
GRID = [
    {'estimator': [MLPClassifier()],
     'estimator__solver': ['adam', 'sgd'],
     'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
     'estimator__max_iter': [500],
     'estimator__hidden_layer_sizes': [(300,), (250,), (200,), (150,)],
     'estimator__activation': ['logistic', 'relu', 'tanh'],
     'estimator__alpha': [1e-4, 1e-3],
     'estimator__early_stopping': [True, False]
     }
]

# Two-step pipeline; the 'scaler' slot is deliberately None because the
# normalization variants below are applied to the data by hand.
PIPELINE = Pipeline([('scaler', None), ('estimator', MLPClassifier())])

# 5-fold cross-validated search optimizing plain accuracy; refit=True
# retrains the best combination on the whole training set afterwards.
grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID,
                           scoring=make_scorer(accuracy_score),
                           n_jobs=-1, cv=5, refit=True, verbose=1,
                           return_train_score=True)

# scikit-learn draws randomness from numpy's global RNG, so seed numpy —
# the stdlib `random.seed(1)` used originally has no effect on the search.
np.random.seed(1)
grid_search.fit(X_train, y_train)
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', None),
('estimator', MLPClassifier())]),
n_jobs=-1,
param_grid=[{'estimator': [MLPClassifier(hidden_layer_sizes=(250,),
max_iter=500)],
'estimator__activation': ['logistic', 'relu', 'tanh'],
'estimator__alpha': [0.0001, 0.001],
'estimator__early_stopping': [True, False],
'estimator__hidden_layer_sizes': [(300,), (250,),
(200,), (150,)],
'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
'estimator__max_iter': [500],
'estimator__solver': ['adam', 'sgd']}],
return_train_score=True, scoring=make_scorer(accuracy_score),
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', None),
('estimator', MLPClassifier())]),
n_jobs=-1,
param_grid=[{'estimator': [MLPClassifier(hidden_layer_sizes=(250,),
max_iter=500)],
'estimator__activation': ['logistic', 'relu', 'tanh'],
'estimator__alpha': [0.0001, 0.001],
'estimator__early_stopping': [True, False],
'estimator__hidden_layer_sizes': [(300,), (250,),
(200,), (150,)],
'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
'estimator__max_iter': [500],
'estimator__solver': ['adam', 'sgd']}],
return_train_score=True, scoring=make_scorer(accuracy_score),
verbose=1)Pipeline(steps=[('scaler', None), ('estimator', MLPClassifier())])None
MLPClassifier()
# Accuracy of the refit best estimator on the full (unnormalized) training set.
print(grid_search.score(X_train, y_train))
# Hyper-parameter combination selected by the cross-validated search.
print(grid_search.best_params_)
0.7521568627450981
{'estimator': MLPClassifier(hidden_layer_sizes=(250,), max_iter=500), 'estimator__activation': 'relu', 'estimator__alpha': 0.0001, 'estimator__early_stopping': False, 'estimator__hidden_layer_sizes': (250,), 'estimator__learning_rate_init': 0.001, 'estimator__max_iter': 500, 'estimator__solver': 'adam'}
print("Test set score: %f" % grid_search.score(X_test, y_test))
Test set score: 0.750000
Без нормализации accuracy = 0.75
# Normalize using statistics computed on the TRAINING data only.
# The original code concatenated train and test, computed the normalization
# statistics over the combined data (test-set leakage), and then re-split
# the concatenation, mixing the original train/test rows (the concatenated
# frames also had duplicate indices, making the label alignment unreliable).
# Here the original split is preserved and the train statistics are reused
# for the test set.
train_df = pd.DataFrame(X_train)
test_df = pd.DataFrame(X_test)

train_mean = train_df.mean()
train_std = train_df.std()
train_min = train_df.min()
train_range = train_df.max() - train_df.min()

# Mean ("z-score") normalization; 'quality' is appended as the last column
# so downstream iloc[:, :-1] keeps working.
df_normalizedStd_train = (train_df - train_mean) / train_std
df_normalizedStd_train['quality'] = y_train
df_normalizedStd_test = (test_df - train_mean) / train_std
df_normalizedStd_test['quality'] = y_test

# Min-max normalization with the same train-only statistics.
df_normalizedMinMax_train = (train_df - train_min) / train_range
df_normalizedMinMax_train['quality'] = y_train
df_normalizedMinMax_test = (test_df - train_min) / train_range
df_normalizedMinMax_test['quality'] = y_test

X_normalizedStd_train = df_normalizedStd_train.iloc[:, :-1].to_numpy()
X_normalizedStd_test = df_normalizedStd_test.iloc[:, :-1].to_numpy()
y_normalizedStd_train = df_normalizedStd_train['quality'].astype(int).to_numpy()
y_normalizedStd_test = df_normalizedStd_test['quality'].astype(int).to_numpy()

# Seed numpy's global RNG (stdlib random.seed does not affect scikit-learn).
np.random.seed(1)
grid_search.fit(X_normalizedStd_train, y_normalizedStd_train)
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', None),
('estimator', MLPClassifier())]),
n_jobs=-1,
param_grid=[{'estimator': [MLPClassifier(activation='logistic',
hidden_layer_sizes=(200,),
learning_rate_init=0.1,
max_iter=500)],
'estimator__activation': ['logistic', 'relu', 'tanh'],
'estimator__alpha': [0.0001, 0.001],
'estimator__early_stopping': [True, False],
'estimator__hidden_layer_sizes': [(300,), (250,),
(200,), (150,)],
'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
'estimator__max_iter': [500],
'estimator__solver': ['adam', 'sgd']}],
return_train_score=True, scoring=make_scorer(accuracy_score),
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', None),
('estimator', MLPClassifier())]),
n_jobs=-1,
param_grid=[{'estimator': [MLPClassifier(activation='logistic',
hidden_layer_sizes=(200,),
learning_rate_init=0.1,
max_iter=500)],
'estimator__activation': ['logistic', 'relu', 'tanh'],
'estimator__alpha': [0.0001, 0.001],
'estimator__early_stopping': [True, False],
'estimator__hidden_layer_sizes': [(300,), (250,),
(200,), (150,)],
'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
'estimator__max_iter': [500],
'estimator__solver': ['adam', 'sgd']}],
return_train_score=True, scoring=make_scorer(accuracy_score),
verbose=1)Pipeline(steps=[('scaler', None), ('estimator', MLPClassifier())])None
MLPClassifier()
# Training accuracy of the best model found on the mean-normalized data.
print(grid_search.score(X_normalizedStd_train, y_normalizedStd_train))
# Best hyper-parameters for the mean-normalized variant.
print(grid_search.best_params_)
0.9984326018808778
{'estimator': MLPClassifier(activation='logistic', hidden_layer_sizes=(200,),
learning_rate_init=0.1, max_iter=500), 'estimator__activation': 'logistic', 'estimator__alpha': 0.0001, 'estimator__early_stopping': False, 'estimator__hidden_layer_sizes': (200,), 'estimator__learning_rate_init': 0.1, 'estimator__max_iter': 500, 'estimator__solver': 'adam'}
print("Test set score: %f" % grid_search.score(X_normalizedStd_test, y_normalizedStd_test))
Test set score: 0.780564
При mean normalization accuracy = 0.78
# Repeat the grid search on the min-max-normalized data.
# 'quality' is the last column, so iloc[:, :-1] selects all features.
X_normalizedMinMax_train = df_normalizedMinMax_train.iloc[:, :-1].to_numpy()
X_normalizedMinMax_test = df_normalizedMinMax_test.iloc[:, :-1].to_numpy()
y_normalizedMinMax_train = df_normalizedMinMax_train['quality'].astype(int).to_numpy()
y_normalizedMinMax_test = df_normalizedMinMax_test['quality'].astype(int).to_numpy()
# Seed numpy's global RNG (stdlib random.seed does not affect scikit-learn).
np.random.seed(1)
grid_search.fit(X_normalizedMinMax_train, y_normalizedMinMax_train)
Fitting 5 folds for each of 288 candidates, totalling 1440 fits
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', None),
('estimator', MLPClassifier())]),
n_jobs=-1,
param_grid=[{'estimator': [MLPClassifier(alpha=0.001,
hidden_layer_sizes=(250,),
max_iter=500)],
'estimator__activation': ['logistic', 'relu', 'tanh'],
'estimator__alpha': [0.0001, 0.001],
'estimator__early_stopping': [True, False],
'estimator__hidden_layer_sizes': [(300,), (250,),
(200,), (150,)],
'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
'estimator__max_iter': [500],
'estimator__solver': ['adam', 'sgd']}],
return_train_score=True, scoring=make_scorer(accuracy_score),
verbose=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,
estimator=Pipeline(steps=[('scaler', None),
('estimator', MLPClassifier())]),
n_jobs=-1,
param_grid=[{'estimator': [MLPClassifier(alpha=0.001,
hidden_layer_sizes=(250,),
max_iter=500)],
'estimator__activation': ['logistic', 'relu', 'tanh'],
'estimator__alpha': [0.0001, 0.001],
'estimator__early_stopping': [True, False],
'estimator__hidden_layer_sizes': [(300,), (250,),
(200,), (150,)],
'estimator__learning_rate_init': [0.1, 0.001, 0.0001],
'estimator__max_iter': [500],
'estimator__solver': ['adam', 'sgd']}],
return_train_score=True, scoring=make_scorer(accuracy_score),
verbose=1)Pipeline(steps=[('scaler', None), ('estimator', MLPClassifier())])None
MLPClassifier()
# Training accuracy of the best model found on the min-max-normalized data.
print(grid_search.score(X_normalizedMinMax_train, y_normalizedMinMax_train))
# Best hyper-parameters for the min-max-normalized variant.
print(grid_search.best_params_)
0.7719435736677116
{'estimator': MLPClassifier(alpha=0.001, hidden_layer_sizes=(250,), max_iter=500), 'estimator__activation': 'relu', 'estimator__alpha': 0.001, 'estimator__early_stopping': False, 'estimator__hidden_layer_sizes': (250,), 'estimator__learning_rate_init': 0.001, 'estimator__max_iter': 500, 'estimator__solver': 'adam'}
print("Test set score: %f" % grid_search.score(X_normalizedMinMax_test, y_normalizedMinMax_test))
Test set score: 0.739812
При min-max normalization accuracy = 0.74
Лучшие результаты показала mean normalization.